Regression

source("functions.R")

On commence par importer le jeu de données et on vérifie s’il y a des valeurs manquantes, ce qui n’est pas le cas. On peut donc continuer avec l’analyse des données en vérifiant le type des variables :

On va transformer Bonus_Malus en variable binaire (« Bonus » si la valeur est inférieure à 100, « Malus » sinon) et retirer les variables qui ne sont pas utiles pour la prédiction, comme PolID.

library(rmarkdown)
library(dplyr)

# Import the training and test sets (comma-separated files, "." as decimal mark).
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
train <- read.csv("./data/train_set.csv", header = TRUE, sep = ",", dec = ".")
test <- read.csv("./data/test_set.csv", header = TRUE, sep = ",", dec = ".")



# Total number of missing cells in the training set
sum(is.na(train))
## [1] 0
# Recode Bonus_Malus as a two-valued label:
# values of 100 and above become "Malus", values below 100 become "Bonus"
train$Bonus_Malus <- ifelse(train$Bonus_Malus >= 100, "Malus", "Bonus")
test$Bonus_Malus <- ifelse(test$Bonus_Malus >= 100, "Malus", "Bonus")
# Remove the PolID column from both sets
train <- select(train, -PolID)
test <- select(test, -PolID)

# Preview of the data
paged_table(train)

On peut maintenant continuer avec l’analyse des données en vérifiant le type des variables:

library(kableExtra)

# Split the columns of `train` by kind. `classifier_variables_tab()` is not
# defined in this document (presumably it comes from functions.R); the code
# below reads its `variables_numeriques`, `variables_categorielles` and
# `variables_binaires` elements.
variables <- classifier_variables_tab(train)
numeric_variables <- data.frame("variables_numériques" = variables$variables_numeriques)
# Binary variables are listed together with the categorical ones.
categorical_variables <- data.frame(
    "variables_catégorielles" = append(variables$variables_categorielles, variables$variables_binaires)
)

# Styled table of the categorical variable names. The stray trailing comma
# (an empty argument to kable_styling()) and the half-commented pipe head
# have been removed.
kable(categorical_variables) %>%
    kable_styling(
        bootstrap_options = c("striped", "hover", "condensed", "responsive"),
        full_width = FALSE
    )
variables_catégorielles
Car_Model
Urban_rural_class
French_region
Bonus_Malus
Car_Fuel
# Styled table of the numeric variable names
numeric_variables %>%
    kable() %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
variables_numériques
Claim
Period_Exp
Car_Power
Car_Age
Age
Inhab_density
On va convertir les variables catégorielles en facteurs ; on obtient alors :


# Re-run the column classification; this time keep plain name vectors
# (not one-column data frames) so they can be used to index train/test.
variables <- classifier_variables_tab(train)
numeric_variables <- variables$variables_numeriques
categorical_variables <- append(variables$variables_categorielles, variables$variables_binaires)

# Convert the categorical columns of the training set to factors.
train[categorical_variables] <- lapply(train[categorical_variables], factor)

# Convert the same columns of the test set, reusing the training levels first.
# Calling factor() independently on each set can give different level sets and
# integer codes (e.g. when a category is absent from one of the two sets).
# Values that only occur in `test` are kept as extra levels appended after the
# training levels, so no observation is turned into NA.
test[categorical_variables] <- lapply(categorical_variables, function(name) {
    train_levels <- levels(train[[name]])
    test_values <- as.character(unique(test[[name]]))
    factor(test[[name]], levels = union(train_levels, test_values))
})
str(train)
## 'data.frame':    542389 obs. of  11 variables:
##  $ Claim            : int  4 5 8 4 11 4 0 0 0 0 ...
##  $ Period_Exp       : num  0.56 1 0.41 0.27 0.08 0.1 0.96 0.73 0.09 0.73 ...
##  $ Car_Power        : int  4 7 4 5 4 4 14 10 4 5 ...
##  $ Car_Age          : int  4 9 12 9 13 1 25 2 12 4 ...
##  $ Age              : int  46 67 52 23 53 31 49 38 27 32 ...
##  $ Bonus_Malus      : Factor w/ 2 levels "Bonus","Malus": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Car_Model        : Factor w/ 11 levels "B1","B10","B11",..: 9 7 1 8 1 4 2 4 10 8 ...
##  $ Car_Fuel         : Factor w/ 2 levels "Diesel","Regular": 1 1 2 1 2 2 2 1 2 1 ...
##  $ Urban_rural_class: Factor w/ 6 levels "A","B","C","D",..: 1 5 4 5 4 5 5 3 3 3 ...
##  $ Inhab_density    : int  29 4762 824 6924 824 2983 5053 160 229 461 ...
##  $ French_region    : Factor w/ 22 levels "Alsace","Aquitaine",..: 7 21 13 12 13 17 12 20 6 6 ...

Étude des variables catégorielles

0.1 Car_Model

# Exploratory plots for Car_Model. The three helpers are not defined in this
# document (presumably they come from functions.R — confirm).
# As the echoed output shows, plot_categorical() also prints the variable name.
plot_categorical(train, "Car_Model")
## [1] "Car_Model"

# NOTE(review): judging by its name, shows the share of each level — confirm.
plot_percentage(train, "Car_Model")

# NOTE(review): judging by its name, breaks the variable down by Claim — confirm.
plot_hist_by_claim(train, "Car_Model")

0.2 Bonus_Malus

# Exploratory plots for Bonus_Malus. The three helpers are not defined in this
# document (presumably they come from functions.R — confirm).
# As the echoed output shows, plot_categorical() also prints the variable name.
plot_categorical(train, "Bonus_Malus")
## [1] "Bonus_Malus"

# NOTE(review): judging by its name, shows the share of each level — confirm.
plot_percentage(train, "Bonus_Malus")

# NOTE(review): judging by its name, breaks the variable down by Claim — confirm.
plot_hist_by_claim(train, "Bonus_Malus")

0.3 Urban_rural_class

# Exploratory plots for Urban_rural_class. The three helpers are not defined in
# this document (presumably they come from functions.R — confirm).
# As the echoed output shows, plot_categorical() also prints the variable name.
plot_categorical(train, "Urban_rural_class")
## [1] "Urban_rural_class"

# NOTE(review): judging by its name, shows the share of each level — confirm.
plot_percentage(train, "Urban_rural_class")

# NOTE(review): judging by its name, breaks the variable down by Claim — confirm.
plot_hist_by_claim(train, "Urban_rural_class")

0.4 Car_Fuel

# Exploratory plots for Car_Fuel. The three helpers are not defined in this
# document (presumably they come from functions.R — confirm).
# As the echoed output shows, plot_categorical() also prints the variable name.
plot_categorical(train, "Car_Fuel")
## [1] "Car_Fuel"

# NOTE(review): judging by its name, shows the share of each level — confirm.
plot_percentage(train, "Car_Fuel")

# NOTE(review): judging by its name, breaks the variable down by Claim — confirm.
plot_hist_by_claim(train, "Car_Fuel")

# Mosaic plot of the Car_Fuel x Bonus_Malus cross-tabulation, with shading
# switched on (shade = TRUE), to inspect the association between the two.
library(vcd)
mosaic(~Car_Fuel + Bonus_Malus, data = train, shade = TRUE)

# Alternative view of the same cross-tabulation, currently disabled:
# assocplot(table(train$Car_Fuel, train$Bonus_Malus))

0.5 French_region

# Reload the helpers from functions.R.
# NOTE(review): the same file is already sourced at the top of the document, so
# this call looks redundant unless functions.R changed in between — confirm.
source("functions.R")
# Exploratory plots for French_region. The helpers below are not defined in
# this document (presumably they come from functions.R — confirm).
# As the echoed output shows, plot_categorical() also prints the variable name.
plot_categorical(train, "French_region")
## [1] "French_region"

# NOTE(review): judging by its name, shows the share of each level — confirm.
plot_percentage(train, "French_region")

# NOTE(review): judging by its name, breaks the variable down by Claim — confirm.
plot_hist_by_claim(train, "French_region")

# Receives the training data and the path to a GeoJSON file; judging by the
# file name, it holds the French regions before the 2015 redrawing.
# NOTE(review): presumably draws claims on a map of those regions — confirm.
plot_claims_by_region(train, "./data/regions-avant-redecoupage-2015.geojson")

1 Étude des variables numériques

1.1 Inhab_density

#' Plot the distribution of one numeric column
#'
#' Prints two plots in sequence: a density-scaled histogram (30 bins) with a
#' kernel density overlay, then a boxplot of the same column.
#'
#' The column is looked up with the `.data` pronoun instead of the deprecated
#' `aes_string()`, and the histogram height uses `after_stat(density)` instead
#' of the deprecated `..density..` notation. This matches how `box_plot()`
#' refers to columns. Axis labels are set explicitly so they do not depend on
#' how ggplot2 derives default labels from the mapping.
#'
#' @param data A data frame.
#' @param variable Name of a column of `data`, given as a single string.
#' @return The value of the final `print()` call on the boxplot.
plot_numeric <- function(data, variable) {
    p1 <- ggplot(data, aes(x = .data[[variable]])) +
        geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "lightblue", color = "black") +
        geom_density(alpha = 0.2, fill = "#FF6666") +
        labs(title = paste("Distribution de la variable", variable), x = variable, y = "density") +
        theme_bw()
    p2 <- ggplot(data, aes(x = .data[[variable]])) +
        geom_boxplot(fill = "lightblue", color = "black") +
        labs(title = paste("Boxplot de la variable", variable), x = variable) +
        theme_bw()

    # Explicit print() calls are needed: plots built inside a function are not
    # auto-printed.
    print(p1)
    print(p2)
}

#' Plot a numeric column against the Claim count
#'
#' `Claim` is converted to a factor on a local copy of `data`, so each distinct
#' claim count gets its own fill colour / box.
#'
#' Previously the boxplot was built and then discarded, and only the histogram
#' was returned. The `type` argument makes the boxplot reachable; its default
#' keeps the earlier behaviour (histogram).
#'
#' @param data A data frame with a `Claim` column.
#' @param col Name of the column of `data` to plot, given as a single string.
#' @param type `"histogram"` (default) for a 20-bin histogram of `col` filled
#'   by Claim, or `"boxplot"` for one box of `col` per Claim value.
#' @return A ggplot object; it is not printed by this function.
box_plot <- function(data, col, type = c("histogram", "boxplot")) {
    type <- match.arg(type)
    data$Claim <- as.factor(data$Claim)

    if (type == "boxplot") {
        return(
            ggplot(data, aes(x = Claim, y = .data[[col]], fill = Claim)) +
                geom_boxplot() +
                labs(title = paste("Distribution de", col, " par Claim"), x = "Claim", y = col) +
                theme_bw()
        )
    }

    # Histogram with 20 bins, bars filled by Claim
    ggplot(data, aes(x = .data[[col]], fill = Claim)) +
        geom_histogram(color = "black", bins = 20, alpha = 1) +
        labs(title = paste("Histogramme de", col, "par Claim"), x = col, y = "Nombre") +
        theme_bw()
}


# Distribution plots for Inhab_density are currently disabled:
# plot_numeric(train,'Inhab_density')
# Histogram of Inhab_density with bars filled by Claim (the plot returned by box_plot()).
box_plot(train, "Inhab_density")

# Number of rows with a density below 1; the echoed output shows there are none.
print(sum(train$Inhab_density < 1))
## [1] 0
# Scatter plot of the claim count against population density, one
# semi-transparent point per row of train.
ggplot(train, aes(x = Inhab_density, y = Claim)) + geom_point(alpha = 0.6, color = "darkorange") +
    labs(title = "Relation entre densité de population et nombre de sinistres",
        x = "Densité (habitants/km²)", y = "Nombre de sinistres") + theme_minimal()

1.2 Age

# Histogram with density overlay, then boxplot, for Age (both printed by plot_numeric()).
plot_numeric(train, "Age")

# Number of rows where Age is above 80 (4943 according to the echoed output).
print(sum(train$Age > 80))
## [1] 4943